************************************************************************
*** This do file generates Data for the Survey Analysis used in the paper,
***		 i.e. for Figure 2 and 
***		      for the self-reported affair importance by candidate vote (as reported in text).

** Data used (Stata 14 format): 
* medw_raw.dta is the survey data main file. 
* 		This is a subset of variables from "SURV_GER_NAT_Bavaria_11_27_2013_cleaned.dta" 
* 		This was kindly provided by the Making Electoral Democracy Work project (special thanks to Steffen Zittlau and Thomas Gschwend)
* aggregate_treat.dta for treatment variables at district level
* candidates_csu_replication.dta for treatment variable at candidate level
* medw_candidatelist_edited.dta holds labels for candidate vote variable
* csu_smd_candidates_2013.dta holds the names and SMD ids 
* cem_model43.dta is used to identify those candidates included in the CEM matching solution (from Table 4 Model 4.3)

** Data created:
* medw_out.dta
* medw_out_forgraph.dta

********************************************************************************************************************************************************

* Session setup: fix the interpreter version, disable output paging,
* move to the replication folder and open the log file.
version 14
set more off

* Set the replication directory here (the folder that contains the
* "survey" and "tables" subfolders):
* global repldirjop "insert here"
*
* Only change directory when the global is actually defined. With an empty
* global, -cd ""- cannot move to the replication folder (NOTE(review): on
* Mac/Unix an argument-less -cd- is documented to jump to the home directory;
* confirm that -cd ""- behaves the same). In that case the current working
* directory is used and reported.
if `"$repldirjop"' != "" {
	cd "$repldirjop"
}
else {
	display as text "note: global repldirjop is not set; using the current working directory"
	pwd
}

capture log close
* Forward slashes are accepted by Stata on every platform; backslashes only on Windows
log using "survey/survey_2014.log", replace

**********************
*** Treatment vars ***
**********************

* Build a small lookup file with the 2013 district-level treatment indicators
* (affair_run, affair_norun), keyed by district number (renamed to stk_num).
* Paths use forward slashes so the do-file also runs on Mac/Unix.
use "tables/aggregate_treat.dta", clear
keep if year == 2013

keep nr affair_run affair_norun
rename nr stk_num
* Temporary lookup file; it is merged into the survey data and erased further below
save "survey/tmp.dta", replace
* district is district-number-2013 here

**************
*** Survey ***
**************

* Load the survey extract and restrict it to respondents with a usable
* Stimmkreis (SMD) number. Path uses a forward slash for portability.
use "survey/medw_raw.dta", clear

drop if POST_COMPLETE_W2 == 0 // completed Bavaria post-elec wave: 4697 cases

*** Stimmkreis/SMD number
* NOTE: VARC is labeled as 'national electoral district', but the variable label
* is incorrect in the raw data (the id for the 45 national election districts is in VARB)
destring VARC, replace
sort VARC
rename VARC stk_num
mvdecode stk_num, mv(999)   // code 999 becomes system-missing
count if stk_num == .       // N=380
display r(N)/_N             // share of respondents without SMD number: 8.1%
drop if stk_num == .        // remain: N=4317


**************************
*** Add treatment vars ***
**************************

* Attach the district-level treatment indicators to each respondent.
* keep(master match) discards districts that appear only in the treatment
* file, so no respondent-less rows are carried into the following steps;
* nogenerate suppresses the _merge variable that was previously dropped by hand.
merge m:1 stk_num using "survey/tmp.dta", keep(master match) nogenerate
* Remove the temporary lookup file again
erase "survey/tmp.dta"

*************************
*** indepvars ***********
*************************

** create alternative-specific vars in long format

* Party-specific (alternative-specific) covariates, one variable per party 1..6;
* the numeric suffix is the party code.
forvalues p = 1/6 {
	* pid: defined whenever the pid question was answered; 1 if the respondent
	* reports a party id for party `p' and feels somewhat or very close to it
	* (not if only 'not very close' or don't know)
	generate pid`p' = (Q46_W1 == 1 & Q46A_W1 == `p' & inlist(Q46B_W1, 2, 3)) if Q46_W1 != .
	* bestdeal: party `p' named as best party to deal with the most important
	* issue; a missing answer counts as 0
	generate bestdeal`p' = (Q3B_W1 == `p')
}

* Left-right distance: absolute distance between the respondent's own
* placement (Q30A_W1) and the sample mean placement of each party
* (Q30AA_W1 ... Q30AF_W1); code 99 is treated as missing.
mvdecode Q30A_W1 Q30A?_W1, mv(99)
local k = 0
foreach ltr in A B C D E F {
	local ++k
	egen lrpmean`k' = mean(Q30A`ltr'_W1)
	generate lrdistm`k' = abs(Q30A_W1 - lrpmean`k')
}


** create respondent-specific vars

* Economic voting:
* "have policies of the Land government made the economy better?"
* 1 worse, 2 not made a difference, 3 better, 9 don't know
* Two dummies; don't know is coded as zero in both.
generate govecopos = Q15A_W1
generate goveconeg = Q15A_W1
recode govecopos (3 = 1) (nonmiss = 0)   // positive evaluation
recode goveconeg (1 = 1) (nonmiss = 0)   // negative evaluation

* Religion (three groups)
* Protestant = Evangelical, Protestant; Evangelical Free Church
* Other/None = Other Christian; Jewish; Muslim; Other; No Religion.
* Missing    = prefer not to say; missing.
label define relig 1 "Catholic" 2 "Protestant" 3 "Other/None"
generate relig = SD3_W1
recode relig (1 = 1) (2 3 = 2) (4 5 6 88 98 = 3) (99 . = .)
label values relig relig

* Union membership dummy: anyone in the household a union member?
* 'no' (2) and 'don't know' (9) are both coded 0
generate union = PSD1_W2
recode union (2 9 = 0)

* High education dummy: one if 'technical high secondary' or higher
generate higheduc = SD4_W1
recode higheduc (1 2 3 = 0) (nonmiss = 1)

* Age categories: each category runs from its own cut point (inclusive)
* up to the next cut point (exclusive); one dummy per category
egen resp_agecat = cut(AGE), at(0 30 40 50 60 100)
quietly tabulate resp_agecat, generate(resp_agecat_gr)

* Franconia dummy: SMD numbers strictly between 399 and 700
generate franconia = (stk_num > 399 & stk_num < 700) if stk_num != .

* Politically very interested (dummy): three highest categories as one,
* don't know (99) as zero
generate vintpol = Q5_W1
recode vintpol (8 9 10 = 1) (1/7 99 = 0)

**************************
*** first vote party   ***
**************************
* Sample restrictions for the vote-choice analysis
* Keep voters only: drops those who did not vote or don't know re turnout
* (N = 354, remain: 3963)
drop if PQ5_2_W2 != 1
* Don't know their vote choice (N = 287, 7.2%): set to missing, but keep obs
mvdecode PQ6_W2, mv(99)
* Drop SMDs whose ballot didn't have candidates from each of the 7 'main'
* parties (these are non-affair SMDs); lose N=60, 3903 remaining
drop if inlist(stk_num, 206, 712)

label define parties 1 "CSU" 2 "SPD" 3 "Greens" 4 "Free Voters" 5 "FDP" 6 "Left" 7 "Pirates" 88 "Other" 98 "Invalid vote"
label values PQ6_W2 parties
* NOTE(review): -if PQ6_W2- is true for every nonzero value, so this condition
* looks unfinished -- confirm what was intended
tabulate PQ6_W2 if PQ6_W2
* Erststimme. Actual: 46.5, 20.4, 8.8, 9.8, 3.3, 2.2, 2.0
tabulate PQ6_W2 if PQ6_W2 != 98

* First vote restricted to the six parties: pirates 84, other 220 and
* invalid 37 (total N=341, 8.6%) are set to missing (.a), but obs are kept
generate pfirstvote = PQ6_W2
recode pfirstvote (7 88 98 = .a)
* voted for 6 parties: %
* NOTE(review): this flags system-missing (.) only; the codes recoded to .a
* above are not equal to . and therefore get 0 -- confirm this is intended
generate pfirstvote_not6 = (pfirstvote == .)

* Long-format preparation: one 0/1 indicator per party, missing stays missing
forvalues p = 1/6 {
	generate pfirstvote`p' = cond(missing(pfirstvote), pfirstvote, pfirstvote == `p')
}

**************************
*** second vote party   **
**************************
* Second vote (party). 98 = don't remember and 99 = don't know become missing.
mvdecode PQ7_W2, mv(98 99)
label values PQ7_W2 parties
tabulate PQ7_W2
rename PQ7_W2 psecondvote

* CSU second-vote dummy; built before the recode below, so pirates/other
* voters count as 0 here rather than missing
generate psecondcsu = cond(missing(psecondvote), psecondvote, psecondvote == 1)

* Restrict to the six parties: pirates N=92, other N=296 are set to
* missing (.a), but obs are kept
recode psecondvote (7 88 = .a)
* voted for 6 parties: %
* NOTE(review): this flags system-missing (.) only; the codes recoded to .a
* above are not equal to . and therefore get 0 -- confirm this is intended
generate psecondvote_not6 = (psecondvote == .)

* Long-format preparation: one 0/1 indicator per party, missing stays missing
forvalues p = 1/6 {
	generate psecondvote`p' = cond(missing(psecondvote), psecondvote, psecondvote == `p')
}


******************************************************************
*** self-report of affair being important for voting decisions ***
******************************************************************

* Survey item: "How important were the following events for your voting
* decision(s)? Relatives affair in the state-level parliament"
* 4 very important, 3 fairly important, 2 not very important,
* 1 not important at all, 8 can't remember, 9 don't know
* Both dummies code 8 and 9 as zero; missing answers stay missing.

* dummy for very important
generate affair_vimp = cond(missing(PQ4A_W2), PQ4A_W2, PQ4A_W2 == 4)
* dummy for important or very important
generate affair_imp = cond(missing(PQ4A_W2), PQ4A_W2, inlist(PQ4A_W2, 3, 4))


*******************************
*** candidate vote variable ***
*******************************

* Candidate (list) vote: collect the candidate code the respondent reported
* for the second vote and attach the candidate label.
* Variable names are written out in full so the block does not depend on
* variable abbreviation being enabled.

generate wk_num = PQ7_PRE_W2 // regional district number
* cross-check with first digit from SMD number:
generate wkfromstk = floor(stk_num/100)
tabulate wk_num wkfromstk, row nofreq
generate wk_inconsistent = 0
replace wk_inconsistent = 1 if wk_num < 99 & wk_num != wkfromstk

generate secondvote = .

* NOTE: set to missing if there are several inconsistent entries
* (which still counts as a party vote in a way, ca. N=35 cases)
forvalues i = 1/7 {
	* blank the first mention if a second or third mention exists and differs from it
	replace PQ7_2`i'_W2M1 = . if (PQ7_2`i'_W2M2 != . & PQ7_2`i'_W2M2 != PQ7_2`i'_W2M1) | (PQ7_2`i'_W2M3 != . & PQ7_2`i'_W2M3 != PQ7_2`i'_W2M1)
	replace secondvote = PQ7_2`i'_W2M1 if PQ7_2`i'_W2M1 != .
}
* sanity check: at most one of the seven first-mention variables is filled per respondent
egen test = rownonmiss(PQ7_2?_W2M1)
assert test <= 1
drop test

generate vote_code = secondvote

* merge labels = candidate names (variable candnumbername)
* unmatched master rows have vote_code missing; label-only rows from the
* using file are not kept
merge m:1 wk_num vote_code using "survey/medw_candidatelist_edited.dta", keep(master match) nogenerate

*************************** Intermediate step: **********************************************************************
** need to remove invalid answers, esp. those naming an SMD candidate as the 2nd-vote choice (an SMD candidate cannot have stood on the list in her/his own SMD)
* (the questionnaire design did not prevent these candidate names from being given as answers; some respondents might also have moved, but the time span between the pre- and post-electoral waves is short)

* Flag second-vote answers that name the SMD candidate of the respondent's
* own SMD and set them to missing.
* The statement order is kept deliberately: -drop party_num-_merge- below
* removes every variable created between party_num and _merge.

gen party_num = psecondvote
recode party_num (88 98 = .)
* candidate name without duplicate blanks and without digits
gen name = itrim(candnumbername)
forvalues i = 0/9 {
	replace name = subinstr(name,"`i'","",.)
}

replace secondvote = . if psecondvote != 1 // only CSU candidate votes of interest

* look for matches in file with Stkr-candidates (fuzzy match, requires additional ado)
* (only for CSU candidates, others not of interest)
* fail early with a clear message if the user-written command is not installed
capture which reclink
if _rc {
	display as error "reclink is not installed; install it (e.g. -ssc install reclink-) and rerun"
	exit 199
}
gen tmpmasterid = _n
reclink stk_num party_num name using "survey/csu_smd_candidates_2013.dta", ///
	idmaster(tmpmasterid) idusing(tmpusingid) gen(match) require(stk_num party_num) minscore(.98) // (picked score after inspection)
* (inspected the fuzzy match and tried different minscore cutoffs)
tab _merge if psecondvote == 1 // among CSU 2nd vote voters: 156 = 11% invalid answers
tab _merge if psecondvote == 1 & wk_num != 5 // (excludes district where no implicated cand on list)
replace secondvote = . if _merge == 3 // set these invalid answers to missing
replace candnumbername = "" if _merge == 3 // set these invalid answers to missing

* remove all helper variables of this step (party_num through _merge)
drop party_num-_merge
replace secondvote = . if secondvote == 998 | secondvote == 999 | secondvote == 1 // invalid; don't know; general party vote
******************* end intermediate step ****************************************************************************************

* psecondcsu == 1 for N = 1409
* candidate info for 1098 = 77.9 % 
* Attach the candidate-level treatment (affair) of the CSU list candidate
* chosen with the second vote.

count if secondvote != . & psecondcsu == 1
* not available for N=311: party vote 60, invalid 3, don't know 69, missing 179 (includes 156 stk cand invalid answers; rest should be those with several entries)

* recode regional district number (wk_num) to the rbez coding used as merge key below
gen rbez = wk_num
recode rbez (1=3) (2=4) (3=2) (4=6) (5=5) (6=7) (7=1)

* list position: first word of the candidate label, last two digits
gen list_pre = real(word(candnumbername,1)) if psecondvote == 1
replace list_pre = mod(list_pre,100) if psecondvote == 1
sum list_pre
gen year = 2013

* now add info for chosen candidate
* The original used -merge m:m- because the using file has missing keys in
* year = 2008. Restricting the using file to complete 2013 keys first allows
* a proper m:1 merge, which stops with an error if a 2013 candidate key is
* not unique instead of silently pairing rows by position.
* NOTE(review): assumes (year, rbez, list_pre) is unique among 2013 rows of
* the using file, as the original note implies -- confirm.
preserve
use year rbez list_pre affair using "tables/candidates_csu_replication.dta", clear
keep if year == 2013
* using rows without a complete key must never match respondents whose list_pre is missing
drop if missing(rbez, list_pre)
tempfile csu2013
save `csu2013'
restore

merge m:1 year rbez list_pre using `csu2013', keepusing(affair)
* _merge == 1: secondvote ==.
* _merge == 2: cand not mentioned as choice by survey respondents
drop year
drop if _merge == 2
drop _merge


* Keep the analysis variables, reshape to one row per respondent x party
* (party_num = 1..6) and save the analysis file.
* The wildcards pfirstvote? / psecondvote? keep only the six party-specific
* indicators (single-character suffix), which become the long-format stubs.
keep RESPID wk_num stk_num pfirstvote? psecondvote? pid* bestdeal* lr* goveco* relig franconia union higheduc female resp* affair* vintpol  ///
    psecondcsu  candnumbername list_pre rbez wk_inconsistent

reshape long pfirstvote psecondvote pid bestdeal lrdistm, i(RESPID) j(party_num)

* forward slash so the path also works on Mac/Unix
save "survey/medw_out.dta", replace

***************************************************************
** estimate conditional logit for party choice with second vote
***************************************************************

* Alternative-specific conditional logit for the second-vote party choice.
* fcontrols   : alternative-specific covariates (vary by party)
* fcasecntrls : case-specific covariates (vary by respondent only)
global fcontrols "pid bestdeal lrdistm"
global fcasecntrls "female resp_agecat_gr2-resp_agecat_gr5 higheduc union i.relig franconia  goveco*"

xi: asclogit psecondvote $fcontrols , case(RESPID) alt(party_num) base(1) casevars($fcasecntrls) 
* N = 2845
predict prob_second // mean probability of second vote (also predict for cases out of sample)

* check: is csu vote correctly predicted?
* csu_correct is defined on the CSU row of respondents with an observed
* six-party second vote. !missing() is used instead of "!= ." because votes
* recoded to the extended missing value .a are not equal to . and would
* otherwise be counted as wrong predictions.
gen csu_correct = 0 if party_num == 1 & !missing(psecondvote)
replace csu_correct = 1 if csu_correct == 0 & party_num == 1 & ( (psecondvote == 1 & prob_second > .5 & prob_second != .) |  ( psecondvote == 0  & prob_second < .5))
tab csu_correct if party_num == 1
tab csu_correct if party_num == 1 & e(sample)

*********************************
** write data used for graph in R
*********************************

* Export the respondent-level data used for the graph in R.
keep if party_num == 1 // data is in long format; need each respondent only once
drop if prob_second == . // respondents without a predicted probability
keep affair_vimp prob_second  affair affair_run affair_norun vintpol wk_inconsistent
* written in Stata 12 format; forward slash so the path also works on Mac/Unix
saveold "survey/medw_out_forgraph.dta", replace version(12)


****************************************************************************
** Self-reported affair importance by candidate vote (as reported in text)
****************************************************************************

* Self-reported affair importance among CSU second-vote voters, by whether
* the chosen list candidate was implicated.
* Variable names are written out in full (no abbreviations) and paths use
* forward slashes so the block also runs on Mac/Unix.
use "survey/medw_out.dta", clear
rename list_pre listrank

* region name for each regional district number (merge key below)
gen RBEZ_name = ""
local wk "Oberbayern Niederbayern Oberpfalz Oberfranken Mittelfranken Unterfranken Schwaben"
forvalues i = 1/7 {
	local w: word `i' of `wk'
	replace RBEZ_name = "`w'" if wk_num == `i'
}

keep if party_num == 1 // data is in long format; need each respondent only once
* keep only CSU second vote voters, and outside Mittelfranken (where no candidate on list was implicated) 
keep if psecondcsu == 1 & wk_num != 5 
* merge info re inclusion in matching sample  (by region name and list rank)
* this is matching sample 3 with N = 45
merge m:1 RBEZ_name listrank using "survey/cem_model43.dta"
drop if _merge == 2 // these are N = 12 matched candidates that were never mentioned by survey respondents as their choice
* turn the merge indicator into a 0/1 flag: 1 = chosen candidate is in the matching sample
rename _merge cem_verybroad
recode cem_verybroad (1 = 0) (3 = 1)
* detach the value label inherited from _merge
label values cem_verybroad
* N = 103 respondents (from among CSU second vote voters, and outside Mittelfranken) chose any matched candidate (implicated or not)

tab candnumbername, missing // candidate vote info among CSU second vote voters outside Mittelfranken
* counts below are hard-coded from the tabulation above
di 1-((144 + 55 ) /1212) // 84% not missing and not Don't know

* Cross-tab of 'vote for implicated candidate' and "self-report: affair important".
* (those who voted for matched candidates, and who also voted CSU with first vote)
tab affair affair_imp if cem_verybroad == 1 & pfirstvote == 1, row
assert wk_inconsistent == 0 if cem_verybroad == 1 & pfirstvote == 1

log close

